# Train/test split: 0.75 is the fraction (75%) of the data to put in the
# training set; the remaining 25% is held out for testing.
train_idx <- sample(seq_len(nrow(movies)), size = floor(0.75 * nrow(movies)))
movies_train <- movies %>% slice(train_idx)
movies_test <- movies %>% slice(-train_idx)
[1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
num [1:7] 1 4 7 10 13 16 19
[1] 1 2 3 4 100 58 5568
num [1:7] 1 2 3 4 100 ...
[1] "hey!"
[1] "jon" "Peter" "Sam"
chr [1:3] "jon" "Peter" "Sam"
chr [1:4] "Dec" "May" "Apr" "Dec"
Factor w/ 3 levels "Apr","Dec","May": 2 3 1 2
birth_month
Apr Dec May
1 2 1
# Calendar-ordered month levels (equivalent to the built-in constant `month.abb`).
month_levels <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep",
    "Oct", "Nov", "Dec")
# Re-level so the factor sorts Jan..Dec instead of alphabetically.
birth_month <- factor(birth_month, levels = month_levels)
str(birth_month)
# Factor w/ 12 levels "Jan","Feb","Mar",..: 12 5 4 12
[1] Dec May Apr
Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
birth_month
Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
0 0 0 1 1 0 0 0 0 0 0 2
You can view the help via vignette("dplyr") and vignette("two-table"), or check out the online docs. dplyr is a part of the tidyverse.
Can Use | as or
Aboriginal Arabic Aramaic Bosnian Cantonese
12 2 5 1 1 11
Chinese Czech Danish Dari Dutch Dzongkha
3 1 5 2 4 1
English Filipino French German Greek Hebrew
4704 1 73 19 1 5
Hindi Hungarian Icelandic Indonesian Italian Japanese
28 1 2 2 11 18
Kannada Kazakh Korean Mandarin Maya Mongolian
1 1 8 26 1 1
None Norwegian Panjabi Persian Polish Portuguese
2 4 1 4 4 8
Romanian Russian Slovenian Spanish Swahili Swedish
2 11 1 40 1 5
Tamil Telugu Thai Urdu Vietnamese Zulu
1 1 3 1 1 2
color director_name num_critic_for_reviews duration
1 Color Serdar Akar 16 122
2 Color Jehane Noujaim 68 108
director_facebook_likes actor_3_facebook_likes actor_2_name
1 11 173 Bergüzar Korel
2 63 5 Ahmed Hassan
actor_1_facebook_likes gross genres
1 205 NA Action|Adventure
2 161 NA Documentary|Drama|History|News
actor_1_name movie_title num_voted_users
1 Necati Sasmaz Valley of the Wolves: Iraq 14486
2 Khalid Abdalla The Square 6678
cast_total_facebook_likes actor_3_name facenumber_in_poster
1 808 Ghassan Massoud 3
2 176 Aida Elkashef 0
plot_keywords
1 abu ghraib|bomb|christian|explosion|iraq
2
movie_imdb_link
1 http://www.imdb.com/title/tt0493264/?ref_=fn_tt_tt_1
2 http://www.imdb.com/title/tt2486682/?ref_=fn_tt_tt_1
num_user_for_reviews language country content_rating budget title_year
1 159 Arabic Turkey 8300000 2006
2 42 Arabic Egypt Not Rated 1500000 2013
actor_2_facebook_likes imdb_score aspect_ratio movie_facebook_likes
1 197 6.0 1.85 467
2 10 8.1 1.85 0
[ reached 'max' / getOption("max.print") -- omitted 3 rows ]
# Keep only English- or Spanish-language movies (| works as "or" in filter;
# %in% is an equivalent, more compact form).
moviesSub <- movies %>% filter(language %in% c("English", "Spanish"))
# Spanish-language movies OR movies with a budget over 10 billion
# (the very large budgets are likely recorded in non-USD currencies).
moviesBig_Spanish <- movies %>% filter(language == "Spanish" | budget > 10000000000)
# English/Spanish movies with budgets over $100M.
movies_eng_sp <- movies %>% filter(language %in% c("English", "Spanish")) %>%
    filter(budget > 100000000)
dim(movies_eng_sp)
# [1] 308 28
Order the rows
Only see certain rows
color director_name num_critic_for_reviews duration
1 Color Dan Scanlon 376 104
2 Color Barry Sonnenfeld 85 106
director_facebook_likes actor_3_facebook_likes actor_2_name
1 37 760 Tyler Labine
2 188 582 Salma Hayek
actor_1_facebook_likes gross
1 12000 268488329
2 10000 113745408
genres actor_1_name
1 Adventure|Animation|Comedy|Family|Fantasy Steve Buscemi
2 Action|Comedy|Sci-Fi|Western Will Smith
movie_title num_voted_users cast_total_facebook_likes
1 Monsters University 235025 14863
2 Wild Wild West 129601 15870
actor_3_name facenumber_in_poster
1 Sean Hayes 0
2 Bai Ling 2
plot_keywords
1 cheating|fraternity|monster|singing in a car|university
2 buddy movie|general|inventor|steampunk|utah
movie_imdb_link
1 http://www.imdb.com/title/tt1453405/?ref_=fn_tt_tt_1
2 http://www.imdb.com/title/tt0120891/?ref_=fn_tt_tt_1
num_user_for_reviews language country content_rating budget
1 265 English USA G 200000000
2 648 English USA PG-13 170000000
title_year actor_2_facebook_likes imdb_score aspect_ratio
1 2013 779 7.3 1.85
2 1999 4000 4.8 1.85
movie_facebook_likes
1 44000
2 0
color director_name num_critic_for_reviews duration
1 Color Joon-ho Bong 363 110
2 Color Carlos Saura 35 115
director_facebook_likes actor_3_facebook_likes actor_2_name
1 584 74 Kang-ho Song
2 98 4 Juan Luis Galiardo
actor_1_facebook_likes gross genres actor_1_name
1 629 2201412 Comedy|Drama|Horror|Sci-Fi Doona Bae
2 341 1687311 Drama|Musical Mía Maestro
movie_title num_voted_users cast_total_facebook_likes actor_3_name
1 The Host 68883 1173 Ah-sung Ko
2 Tango 2412 371 Miguel Ángel Solá
facenumber_in_poster plot_keywords
1 0 daughter|han river|monster|river|seoul
2 3 dancer|director|love|musical filmmaking|tango
movie_imdb_link
1 http://www.imdb.com/title/tt0468492/?ref_=fn_tt_tt_1
2 http://www.imdb.com/title/tt0120274/?ref_=fn_tt_tt_1
num_user_for_reviews language country content_rating budget
1 279 Korean South Korea R 12215500000
2 40 Spanish Spain PG-13 700000000
title_year actor_2_facebook_likes imdb_score aspect_ratio
1 2006 398 7.0 1.85
2 1998 26 7.2 2.00
movie_facebook_likes
1 7000
2 539
[ reached 'max' / getOption("max.print") -- omitted 8 rows ]
Used to pick out certain variables
[1] 5043 4
[1] "movie_title" "director_name" "gross" "budget"
[1] "movie_title" "director_name"
[3] "gross" "budget"
[5] "color" "num_critic_for_reviews"
[7] "duration" "director_facebook_likes"
[9] "actor_3_facebook_likes" "actor_2_name"
[11] "actor_1_facebook_likes" "genres"
[13] "actor_1_name" "num_voted_users"
[15] "cast_total_facebook_likes" "actor_3_name"
[17] "facenumber_in_poster" "plot_keywords"
[19] "movie_imdb_link" "num_user_for_reviews"
[21] "language" "country"
[23] "content_rating" "title_year"
[25] "actor_2_facebook_likes" "imdb_score"
[27] "aspect_ratio" "movie_facebook_likes"
starts_with(): Starts with a prefix.
ends_with(): Ends with a suffix.
contains(): Contains a literal string.
matches(): Matches a regular expression.
num_range(): Matches a numerical range like x01, x02, x03.
one_of(): Matches variable names in a character vector.
everything(): Matches all variables.
last_col(): Select last variable, possibly with an offset.
[1] "movie_title" "director"
[3] "gross" "budget"
[5] "color" "num_critic_for_reviews"
[7] "duration" "director_facebook_likes"
[9] "actor_3_facebook_likes" "actor_2_name"
[11] "actor_1_facebook_likes" "genres"
[13] "actor_1_name" "num_voted_users"
[15] "cast_total_facebook_likes" "actor_3_name"
[17] "facenumber_in_poster" "plot_keywords"
[19] "movie_imdb_link" "num_user_for_reviews"
[21] "language" "country"
[23] "content_rating" "title_year"
[25] "actor_2_facebook_likes" "imdb_score"
[27] "aspect_ratio" "movie_facebook_likes"
# Total gross (in $M) per director; NA gross values contribute nothing.
Director_Tot <- movies %>% group_by(director) %>% summarize(grossTotDir = sum(grossM,
    na.rm = TRUE))
head(Director_Tot)
# A tibble: 6 x 2
director grossTotDir
<fct> <dbl>
1 "" 0
2 A. Raven Cruz 0
3 Aaron Schneider 9.18
4 Aaron Seltzer 48.5
5 Abel Ferrara 1.23
6 Adam Carolla 0.106
# Per-director summary of gross ($M): film count, min, max, mean, and sd.
Director_Stats <- movies %>% group_by(director) %>% summarize(n = n(), min = min(grossM),
    max = max(grossM), avg = mean(grossM), sd = sd(grossM))
# Top 10 directors by highest single-film gross.
Director_Stats %>% arrange(desc(max)) %>% slice(1:10)
# A tibble: 10 x 6
director n min max avg sd
<fct> <int> <dbl> <dbl> <dbl> <dbl>
1 James Cameron 7 38.4 761. 278. 301.
2 Colin Trevorrow 2 4.01 652. 328. 458.
3 Joss Whedon 4 25.3 623. 433. 282.
4 Christopher Nolan 8 25.5 533. 227. 187.
5 George Lucas 5 115 475. 348. 146.
6 Andrew Adamson 4 142. 436. 284. 121.
7 Francis Lawrence 5 58.7 425. 272. 135.
8 Gore Verbinski 7 12.5 423. 190. 154.
9 Roger Allers 2 84.3 423. 254. 239.
10 Lee Unkrich 1 415. 415. 415. NA
# Derive the main genre (first one listed, genres are "|"-separated), plus
# gross/budget/profit in millions and return on investment.
# Use the bare column `genres` inside mutate rather than `movies$genres`,
# which bypasses the pipeline and breaks if rows were filtered upstream.
movies <- movies %>% mutate(genre_main = unlist(map(strsplit(as.character(genres),
    "\\|"), 1)), grossM = gross/1000000, budgetM = budget/1000000, profitM = grossM -
    budgetM, ROI = profitM/budgetM)
# Drop factor levels left unused after filtering.
movies <- movies %>% mutate(genre_main = factor(genre_main) %>% fct_drop())
# Per-director film count plus average gross and average profit (in $M).
# Fixed: the profit column created by the earlier mutate is `profitM`,
# not `profit`.
Director_Avg <- movies %>% group_by(director) %>% summarize(num_movies = n(),
    grossAvgDir = mean(grossM, na.rm = TRUE), profitAvgDir = mean(profitM, na.rm = TRUE))
# Top 20 directors by average profit, restricted to those with > 1 film.
Director_Avg %>% arrange(desc(profitAvgDir)) %>% filter(num_movies > 1) %>%
    slice(1:20)
# A tibble: 20 x 4
director num_movies grossAvgDir profitAvgDir
<fct> <int> <dbl> <dbl>
1 George Lucas 5 348. 277.
2 Colin Trevorrow 2 328. 253.
3 Joss Whedon 4 433. 250.
4 Pierre Coffin 2 310. 237.
5 Roger Allers 2 254. 189.
6 James Cameron 7 278. 171.
7 Pete Docter 3 313. 158.
8 Francis Lawrence 5 272. 151.
9 Irvin Kershner 2 173. 146.
10 Andrew Adamson 4 284. 131.
11 Joel Zwick 2 145. 129.
12 George Roy Hill 2 131. 125.
13 Phil Lord 4 178. 115.
14 Gary Ross 3 183. 111.
15 Jon Favreau 7 223. 110.
16 Oren Peli 2 108. 108.
17 Victor Fleming 3 110. 107.
18 Robert Wise 3 123. 101.
19 Christopher Nolan 8 227. 101.
20 Leonard Nimoy 3 118. 100.
# A tibble: 20 x 4
director num_movies grossAvgDir profitAvgDir
<fct> <int> <dbl> <dbl>
1 Steven Spielberg 26 165. 99.5
2 Woody Allen 22 16.2 -0.00813
3 Clint Eastwood 20 72.5 32.1
4 Martin Scorsese 18 57.2 0.0194
5 Ridley Scott 17 78.7 -5.43
6 Spike Lee 16 21.9 5.26
7 Steven Soderbergh 16 65.7 25.5
8 Tim Burton 16 129. 51.5
9 Renny Harlin 15 34.9 -11.2
10 Oliver Stone 14 52.3 7.67
11 Barry Levinson 13 48.0 15.0
12 John Carpenter 13 24.3 9.57
13 Michael Bay 13 172. 49.6
14 Robert Rodriguez 13 45.4 16.9
15 Robert Zemeckis 13 125. 42.3
16 Ron Howard 13 105. 28.5
17 Sam Raimi 13 171. 51.6
18 Brian De Palma 12 50.1 10.3
19 Joel Schumacher 12 63.2 13.8
20 Kevin Smith 12 20.6 6.31
# Total gross in $M for each director, ignoring missing gross values.
Director_Tot <- movies %>%
    group_by(director) %>%
    summarize(grossTotDir = sum(grossM, na.rm = TRUE))
# Mean, SD, and standard error of ROI by main genre, plus film counts.
# Compute the SD once and reuse it for the SE (summarize evaluates its
# arguments sequentially, so later columns can reference earlier ones).
genre_collapse <- movies %>% group_by(genre_main) %>% summarize(Avg_ROI_genre = mean(ROI,
    na.rm = TRUE), SD_ROI_genre = sd(ROI, na.rm = TRUE),
    SE_ROI_genre = SD_ROI_genre/sqrt(n()), num_films = n())
# Mean, SD, and standard error of ROI by lead actor, plus film counts.
# As above, reuse the already-computed SD when forming the SE.
actor_sum <- movies %>% group_by(actor_1_name) %>% summarize(Avg_ROI_actor = mean(ROI,
    na.rm = TRUE), SD_ROI_actor = sd(ROI, na.rm = TRUE),
    SE_ROI_actor = SD_ROI_actor/sqrt(n()), num_films = n())
# Top 5 actors by average ROI among those with more than two films.
actor_sum %>% filter(num_films > 2) %>% top_n(5, wt = Avg_ROI_actor)
# A tibble: 5 x 5
actor_1_name Avg_ROI_actor SD_ROI_actor SE_ROI_actor num_films
<fct> <dbl> <dbl> <dbl> <int>
1 Gunnar Hansen 246. 211. 122. 3
2 Jamie Lee Curtis 47.0 70.2 20.3 12
3 Jon Heder 36.7 63.8 36.8 3
4 Madeline Kahn 37.4 10.7 4.79 5
5 Michael Emerson 22.3 32.1 18.5 3
# A tibble: 19 x 1
SE_ROI_genre
<dbl>
1 0.291
2 0.276
3 2.07
4 12.3
5 0.470
6 0.232
7 1.67
8 0.296
9 7.14
10 1.11
11 NA
12 NA
13 40.1
14 11.4
15 0.476
16 1.08
17 0.618
18 1.44
19 0.471
[1] 0.2909999 0.2755208 2.0667876 12.3284782 0.4697230 0.2315017
[7] 1.6732844 0.2963527 7.1413626 1.1121905 NA NA
[13] 40.1424230 11.4088478 0.4763281 1.0834822 0.6181190 1.4418917
[19] 0.4705548
You can view the help via vignette("magrittr"), or check out the online docs. magrittr is a part of the tidyverse.
We start with a value, here mtcars (a data.frame). Based on this, we first extract a subset, then we aggregate the information based on the number of cylinders, and then we transform the dataset by adding a variable for kilometers per liter as supplement to miles per gallon. Finally we print the result before assigning it. Note how the code is arranged in the logical order of how you think about the task: data->transform->aggregate, which is also the same order as the code will execute. It’s like a recipe – easy to read, easy to follow!
# Fixed typo: the package is "magrittr", not "magritter".
library(magrittr)
# Subset mtcars to hp > 100, aggregate column means by cylinder count
# (rounded to 2 dp, using a unary pipe-built function as FUN), add a
# km-per-liter column via the magrittr alias multiply_by, and print the
# result before assigning it.
car_data <- mtcars %>% subset(hp > 100) %>% aggregate(. ~ cyl, data = ., FUN = . %>%
    mean %>% round(2)) %>% transform(kpl = mpg %>% multiply_by(0.4251)) %>%
    print
1 4 25.90 108.05 111.00 3.94 2.15 17.75 1.00 1.00 4.50 2.00 11.010090
2 6 19.74 183.31 122.29 3.59 3.12 17.98 0.57 0.43 3.86 3.43 8.391474
3 8 15.10 353.10 209.21 3.23 4.00 16.77 0.00 0.14 3.29 3.50 6.419010
Note also how “building” a function on the fly for use in aggregate is very simple in magrittr: rather than an actual value as left-hand side in pipeline, just use the placeholder. This is also very useful in R’s *apply family of functions.
The combined example shows a few neat features of the pipe:
%>% may be used in a nested fashion, e.g. it may appear in expressions within arguments. This is used in the mpg-to-kpl conversion. One feature which was not utilized above is piping into anonymous functions, or lambdas. This is possible using standard function definitions, e.g.
cyl mpg disp hp drat wt qsec vs am gear carb kpl
1 4 25.9 108.05 111.00 3.94 2.15 17.75 1 1.00 4.50 2.0 11.01009
3 8 15.1 353.10 209.21 3.23 4.00 16.77 0 0.14 3.29 3.5 6.41901
However, magrittr also allows a short-hand notation:
cyl mpg disp hp drat wt qsec vs am gear carb kpl
1 4 25.9 108.05 111.00 3.94 2.15 17.75 1 1.00 4.50 2.0 11.01009
3 8 15.1 353.10 209.21 3.23 4.00 16.77 0 0.14 3.29 3.5 6.41901
The “tee” operator, %T>% works like %>%, except it returns the left-hand side value, and not the result of the right-hand side operation. This is useful when a step in a pipeline is used for its side-effect (printing, plotting, logging, etc.). As an example:
[1] 6.464832 -6.553345
The “exposition” pipe operator, %$% exposes the names within the left-hand side object to the right-hand side expression. Essentially, it is a short-hand for using the with functions (and the same left-hand side objects are accepted). This operator is handy when functions do not themselves have a data argument, as for example lm and aggregate do. Here are a few examples as illustration:
[1] 0.3361992
Finally, the compound assignment pipe operator %<>% can be used as the first pipe in a chain. The effect will be that the result of the pipeline is assigned to the left-hand side object, rather than returning the result as usual. It is essentially shorthand notation for expressions like foo <- foo %>% bar %>% baz, which boils down to foo %<>% bar %>% baz. Another example is
The %<>% can be used whenever expr <- … makes sense, e.g.
x %<>% foo %>% bar
x[1:10] %<>% foo %>% bar
x$baz %<>% foo %>% bar
ggplot2 is a system for declaratively creating graphics, based on The Grammar of Graphics. You provide the data, tell ggplot2 how to map variables to aesthetics, what graphical primitives to use, and it takes care of the details.
It’s hard to succinctly describe how ggplot2 works because it embodies a deep philosophy of visualisation. However, in most cases you start with ggplot(), supply a dataset and aesthetic mapping (with aes()). You then add on layers (like geom_point() or geom_histogram()), scales (like scale_colour_brewer()), faceting specifications (like facet_wrap()) and coordinate systems (like coord_flip()).
geom_bar makes the height of the bar proportional to the number of cases in each group (or if the weight aesthetic is supplied, the sum of the weights). If you want the heights of the bars to represent values in the data, use geom_col() instead. geom_bar() uses stat_count() by default: it counts the number of cases at each x position. geom_col() uses stat_identity(): it leaves the data as is.
# Bar heights are the sums of SeriousDlqin2yrs at each age (stat = "identity").
ggplot(gmsc_train, aes(y = SeriousDlqin2yrs, x = age)) + geom_bar(stat = "identity",
    fill = "red") + ggtitle("Serious Delinquencies in 2 years vs By Age") +
    labs(x = "Age", y = "Serious Delinquencies in 2 years")
# geom_col() is shorthand for geom_bar(stat = "identity").
ggplot(gmsc_train, aes(y = SeriousDlqin2yrs, x = NumberRealEstateLoansOrLines)) +
    geom_col(fill = "red") + ggtitle("Serious Delinquencies in 2 years vs Number of Loans or Lines") +
    labs(x = "Number of Real Estate Loans Or Lines", y = "Serious Delinquencies in 2 years")
# The jitter geom is a convenient shortcut for geom_point(position = "jitter").
# It adds a small amount of random variation to each point's location -- a
# useful way of handling overplotting caused by discreteness.
ggplot(gmsc_train, aes(x = MonthlyIncome, y = SeriousDlqin2yrs)) + geom_jitter(alpha = 1/10,
    color = "hotpink") + xlim(0, 100000) + ggtitle("Serious Delinquencies in 2 years vs Monthly Income") +
    labs(x = "Monthly Income", y = "Serious Delinquencies in 2 years")
# Histograms (geom_histogram()) bin a single continuous variable and display
# counts with bars; frequency polygons (geom_freqpoly()) display counts with
# lines and suit comparisons across levels of a categorical variable.
# Boxplots of highway mpg by vehicle class, with classes ordered by median hwy.
mpg %>% ggplot(aes(x = reorder(class, hwy), y = hwy, fill = class)) + geom_boxplot() +
    xlab("class") + theme(legend.position = "none")
# In a line graph, observations are ordered by x value and connected.
The functions geom_line(), geom_step(), or geom_path() can be used.
x value (for x axis) can be :
Data derived from ToothGrowth data sets are used. ToothGrowth describes the effect of Vitamin C on tooth growth in Guinea pigs.
See more here
dose len
1 D0.5 4.2
2 D1 10.0
3 D2 29.5
# Basic line plot with points
ggplot(data = df, aes(x = dose, y = len, group = 1)) + geom_line() + geom_point()
# Change the line type
ggplot(data = df, aes(x = dose, y = len, group = 1)) + geom_line(linetype = "dashed") +
    geom_point()
# Change the color
ggplot(data = df, aes(x = dose, y = len, group = 1)) + geom_line(color = "red") +
    geom_point()
# Observations can also be connected using geom_step() or geom_path():
ggplot(data = df, aes(x = dose, y = len, group = 1)) + geom_step() + geom_point()
ggplot(data = df, aes(x = dose, y = len, group = 1)) + geom_path() + geom_point()
# Horizontal bar chart of average ROI by genre with +/- 1 SE error bars.
ggplot(Genre_ROI, aes(y = ROI_avg, x = genre_main)) + geom_bar(stat = "identity") +
    coord_flip() + ggtitle("Best Performing Genre By ROI") + geom_errorbar(mapping = aes(ymin = ROI_avg -
    se_ROI, ymax = ROI_avg + se_ROI))
# ggthemes provides extra ggplot2 themes -- see the package docs.
library("ggthemes")
# Base example
ggplot() + geom_point(data = iris, aes(x = Petal.Width, y = Petal.Length, color = Species))
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# For the complete list of themes in the package, list the package namespace
# and keep only the names containing "theme_".
ls("package:ggthemes")[grepl("theme_", ls("package:ggthemes"))]
[3] "theme_clean" "theme_economist"
[5] "theme_economist_white" "theme_excel"
[7] "theme_excel_new" "theme_few"
[9] "theme_fivethirtyeight" "theme_foundation"
[11] "theme_gdocs" "theme_hc"
[13] "theme_igray" "theme_map"
[15] "theme_pander" "theme_par"
[17] "theme_solarized" "theme_solarized_2"
[19] "theme_solid" "theme_stata"
[21] "theme_tufte" "theme_wsj"
# Theme similar to the default settings of LibreOffice Calc
ggplot() + geom_point(data = iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
    theme_calc()
# Style plots similar to those in The Economist
ggplot() + geom_point(data = iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
    theme_economist()
# White-background variant of The Economist theme
ggplot() + geom_point(data = iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
    theme_economist_white()
# Theme based on the rules and examples in Stephen Few, 'Practical rules for
# using colors in charts'
ggplot() + geom_point(data = iris, aes(x = Petal.Width, y = Petal.Length, color = Species)) +
    theme_few()
library(ISLR)
library(tidyverse)
data("Default")
library(caret)
# Logistic regression of default on balance.
logit_fit3 <- glm(default ~ balance, family = binomial, data = Default)
preds_DF <- data.frame(preds = predict(logit_fit3, type = "response"), true = factor(Default$default,
    levels = c("Yes", "No")))
# Lift chart: how much better the model ranks defaulters than random ordering.
creditLift <- lift(true ~ preds, data = preds_DF)
xyplot(creditLift, main = "X = balance")
# Same exercise with student as the single predictor.
library("caret")
logit_fit2 <- glm(default ~ student, family = binomial, data = Default)
preds_DF <- data.frame(preds = predict(logit_fit2, type = "response"), true = factor(Default$default,
    levels = c("Yes", "No")))
creditLift <- lift(true ~ preds, data = preds_DF)
xyplot(creditLift, main = "X = student")
# Data for a calibration check: 0/1 outcome plus the model's predicted scores.
scores3DF <- data.frame(default = ifelse(Default$default == "Yes", 1, 0), scores = predict(logit_fit3,
    type = "response"))
library(plyr)
# Bin the predicted scores and take column means within each bin: the mean of
# `default` is the observed event rate, the mean of `scores` the avg prediction.
calData <- ddply(scores3DF, .(cut(scores3DF$scores, c(0, 0.05, 0.15, 0.25, 0.35,
    0.45, 0.55, 0.65, 0.75, 0.85, 0.95, 1))), colwise(mean))
# Midpoints of the score bins, used as the x axis of the calibration plot.
calData$midpoint <- c(0.025, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.975)
colnames(calData) <- c("preds", "true", "midpoint")
# Calibration plot: points near the red y = x line indicate good calibration.
calPlot <- ggplot(calData, aes(x = midpoint, y = true)) + geom_point() + ylim(0,
    1) + geom_abline(intercept = 0, slope = 1, color = "red") + xlab("Prediction midpoint") +
    ylab("Observed event percentage")
plot(calPlot)
library("ROSE")
# Down-sample with ROSE to a synthetic dataset of N = 666 rows; p = 1/2 targets
# a roughly balanced class split.
data_rose_down <- ROSE(default ~ ., data = Default, N = 666, p = 1/2)
table(data_rose_down$data$default)
No Yes
312 354
# Up-sample to N = 12000 rows, again targeting a roughly 50/50 class split.
data_rose_up <- ROSE(default ~ ., data = Default, N = 12000, p = 1/2)
table(data_rose_up$data$default)
No Yes
6025 5975
# logit downsampled model: refit the balance-only logit on the balanced
# ROSE down-sampled data.
logit_down <- glm(default ~ balance, data = data_rose_down$data, family = "binomial")
summary(logit_down)
Call:
glm(formula = default ~ balance, family = "binomial", data = data_rose_down$data)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.76567 -0.28293 0.09854 0.38613 2.89963
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -7.1201686 0.5789729 -12.30 <0.0000000000000002 ***
balance 0.0055852 0.0004282 13.04 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 920.62 on 665 degrees of freedom
Residual deviance: 368.96 on 664 degrees of freedom
AIC: 372.96
Number of Fisher Scoring iterations: 6
# logit up-sampled: same model refit on the ROSE up-sampled data.
logit_up <- glm(default ~ balance, data = data_rose_up$data, family = "binomial")
summary(logit_up)
Call:
glm(formula = default ~ balance, family = "binomial", data = data_rose_up$data)
Deviance Residuals:
Min 1Q Median 3Q Max
-3.4497 -0.3822 -0.0351 0.4427 2.9333
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -6.6309657 0.1242331 -53.38 <0.0000000000000002 ***
balance 0.0050092 0.0000895 55.97 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 16635 on 11999 degrees of freedom
Residual deviance: 7577 on 11998 degrees of freedom
AIC: 7581
Number of Fisher Scoring iterations: 6
# vanilla logit fit on the original (imbalanced) data
logit <- glm(default ~ balance, data = Default, family = "binomial")
# generate predicted probabilities from all three fits
scores_down <- predict(logit_down, type = "response")
scores_up <- predict(logit_up, type = "response")
scores_reg <- predict(logit, type = "response")
# convert probabilities to 0/1 class predictions at the 0.5 threshold
class_down <- ifelse(scores_down > 0.5, 1, 0)
class_up <- ifelse(scores_up > 0.5, 1, 0)
class_reg <- ifelse(scores_reg > 0.5, 1, 0)
# > diagnostics(table(data_rose_down$data$default,class_down)) Accuracy:
# 0.872 TP: 307 TN: 274 Sensitivity: 0.9 Specificity: 0.843 False Pos Rate:
# 0.157 > diagnostics(table(data_rose_up$data$default,class_up)) Accuracy:
# 0.869 TP: 5282 TN: 5149 Sensitivity: 0.88 Specificity: 0.859 False Pos
# Rate: 0.141 > diagnostics(table(Default$default,class_reg))# Auto dataset
# Auto dataset: drop the textual `name` column before modeling.
library(ISLR)
library(tidyverse)
set.seed(1861)
data(Auto)
Auto_sub <- Auto %>% select(-name)
head(Auto_sub)
1 18 8 307 130 3504 12.0 70 1
2 15 8 350 165 3693 11.5 70 1
3 18 8 318 150 3436 11.0 70 1
4 16 8 304 150 3433 12.0 70 1
5 17 8 302 140 3449 10.5 70 1
6 15 8 429 198 4341 10.0 70 1
# Leave-one-out cross-validation for the full linear model:
# fit on all rows but i, then predict the held-out row i.
# Preallocate the result containers instead of growing them in the loop,
# and index by Auto_sub (the data actually modeled) rather than Auto --
# they have the same row count here, but mixing the two is fragile.
mods_LOOCV <- vector("list", nrow(Auto_sub))
preds_LOOCV <- numeric(nrow(Auto_sub))
for (i in seq_len(nrow(Auto_sub))) {
    mod <- lm(mpg ~ ., data = Auto_sub %>% slice(-i))
    preds_LOOCV[i] <- predict(mod, newdata = slice(Auto_sub, i))
    mods_LOOCV[[i]] <- mod
}
head(preds_LOOCV)
# [1] 14.92963 13.98084 15.17965 15.04054 14.91319 10.54207
# Fit once on the full data for comparison with the LOOCV predictions.
mod_insample <- lm(mpg ~ ., data = Auto_sub)
# compute RMSE LOOCV vs in-sample
preds_DF <- data.frame(preds_LOOCV = preds_LOOCV, preds_insample = predict(mod_insample),
    true = Auto$mpg)
library(caret)
RMSE(preds_DF$preds_LOOCV, preds_DF$true)
# [1] 3.37211
[1] 3.293551
[1] 0.8128915
[1] 0.8214781
### K-Fold Cross Validation
# Assign each row to one of 10 folds via caret::createFolds.
# NOTE(review): the `folds` column stays in Auto_sub, so `mpg ~ .` below also
# uses it as a predictor -- probably unintended; confirm before relying on the
# fitted coefficients.
Auto_sub <- mutate(Auto_sub, folds = createFolds(Auto_sub$mpg, k = 10, list = FALSE))
nfolds <- 10
preds_10FoldCV_DF <- data.frame(folds = Auto_sub$folds, preds_10FoldCV = rep(NA,
    nrow(Auto_sub)))
# Fit on 9 folds, predict the held-out fold, store predictions by fold.
for (i in seq_len(nfolds)) {
    mod <- lm(mpg ~ ., data = Auto_sub %>% filter(folds != i))
    preds <- predict(mod, newdata = filter(Auto_sub, folds == i))
    preds_10FoldCV_DF[preds_10FoldCV_DF$folds == i, "preds_10FoldCV"] <- preds
}
preds_DF <- data.frame(preds_10FoldCV = preds_10FoldCV_DF$preds_10FoldCV, preds_DF)
RMSE(preds_DF$preds_10FoldCV, preds_DF$true)
# [1] 3.349023
[1] 3.37211
[1] 3.293551
[1] 0.8154466
[1] 0.8128915
[1] 0.8214781
B <- 100  # number of bootstrapped datasets
n_boot <- 200  # size of each bootstrapped sample
# Preallocate instead of growing the vector inside the loop.
coef_boot <- numeric(B)
for (b in seq_len(B)) {
    # Resample n_boot rows with replacement, refit, keep the slope estimate.
    idx <- sample(seq_len(nrow(Auto_sub)), size = n_boot, replace = TRUE)
    mod <- lm(mpg ~ displacement, data = Auto_sub %>% slice(idx))
    coef_boot[b] <- mod$coefficients[2]
}
# Full-sample fit for reference.
mod_lm <- lm(mpg ~ displacement, data = Auto_sub)
# Histogram of the bootstrap slope distribution; the red line marks the
# full-sample estimate.
coef_boot <- data.frame(coef_boot = coef_boot)
ggplot(coef_boot, aes(x = coef_boot)) + geom_histogram() + geom_vline(xintercept = mod_lm$coefficients[2],
    color = "red")
library(rsample)
library(broom)
library(purrr)
library(boot)
# 100 bootstrap resamples stored as rsample split objects.
boots <- bootstraps(Auto_sub, times = 100)
boots
# A tibble: 100 x 2
splits id
<list> <chr>
1 <split [392/147]> Bootstrap001
2 <split [392/145]> Bootstrap002
3 <split [392/145]> Bootstrap003
4 <split [392/148]> Bootstrap004
5 <split [392/140]> Bootstrap005
6 <split [392/141]> Bootstrap006
7 <split [392/130]> Bootstrap007
8 <split [392/140]> Bootstrap008
9 <split [392/149]> Bootstrap009
10 <split [392/147]> Bootstrap010
# … with 90 more rows
# Fit a no-intercept mpg ~ displacement model to one bootstrap resample.
fit_lm_on_boots <- function(split) {
    lm(mpg ~ displacement - 1, data = analysis(split))
}
# Fit the model to every resample and tidy each fit's coefficient table.
boot_mods <- boots %>% mutate(model = map(splits, fit_lm_on_boots), coef_info = map(model,
    tidy))
# One row per coefficient per resample.
boot_coefs <- boot_mods %>% unnest(coef_info)
boot_coefs
# A tibble: 100 x 8
splits id model term estimate std.error statistic p.value
<list> <chr> <lis> <chr> <dbl> <dbl> <dbl> <dbl>
1 <split [39… Bootstr… <lm> displa… 0.0770 0.00372 20.7 8.98e-65
2 <split [39… Bootstr… <lm> displa… 0.0841 0.00410 20.5 6.15e-64
3 <split [39… Bootstr… <lm> displa… 0.0835 0.00408 20.5 7.02e-64
4 <split [39… Bootstr… <lm> displa… 0.0784 0.00394 19.9 2.52e-61
5 <split [39… Bootstr… <lm> displa… 0.0770 0.00372 20.7 1.10e-64
6 <split [39… Bootstr… <lm> displa… 0.0799 0.00409 19.5 9.33e-60
7 <split [39… Bootstr… <lm> displa… 0.0799 0.00398 20.1 4.25e-62
8 <split [39… Bootstr… <lm> displa… 0.0857 0.00430 19.9 2.17e-61
9 <split [39… Bootstr… <lm> displa… 0.0821 0.00402 20.4 1.70e-63
10 <split [39… Bootstr… <lm> displa… 0.0815 0.00394 20.7 1.01e-64
# … with 90 more rows
library("leaps")
auto_fit_fwd <- regsubsets(mpg ~ ., data = Auto_sub, nvmax = 7, method = "forward")
summary(auto_fit_fwd)Subset selection object
Call: regsubsets.formula(mpg ~ ., data = Auto_sub, nvmax = 7, method = "forward")
8 Variables (and intercept)
Forced in Forced out
cylinders FALSE FALSE
displacement FALSE FALSE
horsepower FALSE FALSE
weight FALSE FALSE
acceleration FALSE FALSE
year FALSE FALSE
origin FALSE FALSE
folds FALSE FALSE
1 subsets of each size up to 7
Selection Algorithm: forward
cylinders displacement horsepower weight acceleration year origin
1 ( 1 ) " " " " " " "*" " " " " " "
2 ( 1 ) " " " " " " "*" " " "*" " "
3 ( 1 ) " " " " " " "*" " " "*" "*"
4 ( 1 ) " " "*" " " "*" " " "*" "*"
5 ( 1 ) " " "*" "*" "*" " " "*" "*"
6 ( 1 ) "*" "*" "*" "*" " " "*" "*"
7 ( 1 ) "*" "*" "*" "*" " " "*" "*"
folds
1 ( 1 ) " "
2 ( 1 ) " "
3 ( 1 ) " "
4 ( 1 ) " "
5 ( 1 ) " "
6 ( 1 ) " "
7 ( 1 ) "*"
# Backward stepwise selection over the same candidate predictor set.
auto_fit_bkwd <- regsubsets(mpg ~ ., data = Auto_sub, nvmax = 7, method = "backward")
summary(auto_fit_bkwd)
Call: regsubsets.formula(mpg ~ ., data = Auto_sub, nvmax = 7, method = "backward")
8 Variables (and intercept)
Forced in Forced out
cylinders FALSE FALSE
displacement FALSE FALSE
horsepower FALSE FALSE
weight FALSE FALSE
acceleration FALSE FALSE
year FALSE FALSE
origin FALSE FALSE
folds FALSE FALSE
1 subsets of each size up to 7
Selection Algorithm: backward
cylinders displacement horsepower weight acceleration year origin
1 ( 1 ) " " " " " " "*" " " " " " "
2 ( 1 ) " " " " " " "*" " " "*" " "
3 ( 1 ) " " " " " " "*" " " "*" "*"
4 ( 1 ) " " "*" " " "*" " " "*" "*"
5 ( 1 ) " " "*" "*" "*" " " "*" "*"
6 ( 1 ) "*" "*" "*" "*" " " "*" "*"
7 ( 1 ) "*" "*" "*" "*" " " "*" "*"
folds
1 ( 1 ) " "
2 ( 1 ) " "
3 ( 1 ) " "
4 ( 1 ) " "
5 ( 1 ) " "
6 ( 1 ) " "
7 ( 1 ) "*"
You can use str() to get info about what is contained in a model, e.g. str(mod1).
## Setup Test/Train
# Setup Test/Train: 0.75 is the fraction (75%) of the data to put in the
# training set; the rest is held out for testing.
train_idx <- sample(seq_len(nrow(movies)), size = floor(0.75 * nrow(movies)))
movies_train <- movies %>% slice(train_idx)
movies_test <- movies %>% slice(-train_idx)
Linear regression is a linear approach to modeling the relationship between a scalar response (or dependent variable) and one or more explanatory variables (or independent variables). The case of one explanatory variable is called simple linear regression. Generate a linear model with lm(); the desired formula is written with the dependent variable followed by ~ and then a list of the independent variables. You can use . for all variables, or something like y ~ . - director to exclude one. You can get the coefficients like this: mod1$coefficients[1]
Call:
lm(formula = gross ~ budget + duration, data = movies_train)
Residuals:
Min 1Q Median 3Q Max
-398660902 -23412577 -9492452 12647133 495725939
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -12355521.86555 4930418.20639 -2.506 0.0123
budget 0.99734 0.02389 41.752 < 0.0000000000000002
duration 229023.81048 45694.54514 5.012 0.000000571
(Intercept) *
budget ***
duration ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 53860000 on 2905 degrees of freedom
(496 observations deleted due to missingness)
Multiple R-squared: 0.4172, Adjusted R-squared: 0.4168
F-statistic: 1040 on 2 and 2905 DF, p-value: < 0.00000000000000022
Logistic regression is a statistical model that in its basic form uses a logistic function to model a binary dependent variable, although many more complex extensions exist. In regression analysis, logistic regression (or logit regression) is estimating the parameters of a logistic model (a form of binary regression). Use function glm() notice the family = binomial
library(ISLR)
data(Default)
# Discourage scientific notation in printed coefficients.
options(scipen = 9)
# Logistic regression of default (binary) on balance; family = binomial
# gives the logit link.
logitMod1 <- glm(factor(default) ~ balance, family = binomial, data = Default)
summary(logitMod1)
Call:
glm(formula = factor(default) ~ balance, family = binomial, data = Default)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.2697 -0.1465 -0.0589 -0.0221 3.7589
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -10.6513306 0.3611574 -29.49 <2e-16 ***
balance 0.0054989 0.0002204 24.95 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2920.6 on 9999 degrees of freedom
Residual deviance: 1596.5 on 9998 degrees of freedom
AIC: 1600.5
Number of Fisher Scoring iterations: 8
(Intercept) balance
-10.6513 0.0055
library(glmnet)
library(glmnetUtils)
# Load the movies dataset and prepare it for a penalized regression of profit.
library("tidyverse")
options(scipen = 50)  # suppress scientific notation in printed coefficients
set.seed(1861)        # reproducible train/test split and CV folds
movies <- read.csv(here::here("datasets", "movie_metadata.csv"))
# Drop implausible budgets, unrated films, and rows with a missing gross.
movies <- movies %>%
    filter(budget < 400000000) %>%
    filter(content_rating != "", content_rating != "Not Rated", !is.na(gross))
# genre_main = first genre listed in the pipe-separated genres field;
# grossM / budgetM / profitM are expressed in millions of dollars.
# (Reference the piped column `genres` directly rather than movies$genres.)
movies <- movies %>%
    mutate(genre_main = unlist(map(strsplit(as.character(genres), "\\|"), 1)),
        grossM = gross/1000000,
        budgetM = budget/1000000,
        profitM = grossM - budgetM)
# Collapse rare factor levels into "Other" and drop incomplete rows.
# (A stray trailing comma in the original mutate() call has been removed.)
movies <- movies %>%
    mutate(genre_main = fct_lump(genre_main, 5),
        content_rating = fct_lump(content_rating, 3),
        country = fct_lump(country, 2),
        cast_total_facebook_likes000s = cast_total_facebook_likes/1000) %>%
    drop_na()
# 75/25 train/test split (seq_len() is the safe-sequence idiom; identical
# draw to sample(1:nrow(movies), ...) under the same seed).
train_idx <- sample(seq_len(nrow(movies)), size = floor(0.75 * nrow(movies)))
movies_train <- movies %>% slice(train_idx)
movies_test <- movies %>% slice(-train_idx)
# Estimate ridge regression (alpha = 0) with cross-validated lambda,
# excluding free-text identifiers and columns redundant with the outcome.
Ridge_mod <- cv.glmnet(profitM ~ .,
    data = movies_train %>% select(-c(director_name, actor_1_name,
        actor_2_name, actor_3_name, plot_keywords, movie_imdb_link,
        country, budgetM, grossM, genres, language, movie_title)),
    alpha = 0)
coef(Ridge_mod)
31 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 353.116523189860
color 3.220429457000
color Black and White -1.230889234459
colorColor 1.273946091430
num_critic_for_reviews -0.004287598868
duration -0.059435354254
director_facebook_likes -0.000095603299
actor_3_facebook_likes 0.000377629504
actor_1_facebook_likes -0.000046854225
gross 0.000000808363
num_voted_users 0.000026098607
cast_total_facebook_likes 0.000011936125
facenumber_in_poster 0.097107398006
num_user_for_reviews 0.001753590606
content_ratingPG 1.541944591659
content_ratingPG-13 -0.081014605835
content_ratingR -0.790656664619
content_ratingOther 0.262087870726
budget -0.000000764324
title_year -0.175902433003
actor_2_facebook_likes -0.000072922047
imdb_score 0.793676412085
aspect_ratio -1.260327725766
movie_facebook_likes 0.000031219836
genre_mainAction -2.508682376558
genre_mainAdventure -0.949174857631
genre_mainComedy 1.857795657356
genre_mainCrime -0.752570921384
genre_mainDrama 0.358810332201
genre_mainOther 1.660817121433
cast_total_facebook_likes000s 0.021206225810
library(glmnet)
library(glmnetUtils)
# Load the movies dataset and prepare it for the lasso. Same cleaning as the
# ridge section, but identifier and leakage columns are dropped up front.
library("tidyverse")
options(scipen = 50)  # suppress scientific notation in printed coefficients
set.seed(1861)        # reproducible train/test split and CV folds
movies <- read.csv(here::here("datasets", "movie_metadata.csv"))
movies <- movies %>%
    filter(budget < 400000000) %>%
    filter(content_rating != "", content_rating != "Not Rated", !is.na(gross))
# genre_main = first genre in the pipe-separated genres field; money columns
# in millions. (Reference the piped column `genres`, not movies$genres.)
movies <- movies %>%
    mutate(genre_main = unlist(map(strsplit(as.character(genres), "\\|"), 1)),
        grossM = gross/1000000,
        budgetM = budget/1000000,
        profitM = grossM - budgetM)
movies <- movies %>%
    mutate(genre_main = fct_lump(genre_main, 5),
        content_rating = fct_lump(content_rating, 3),
        country = fct_lump(country, 2),
        cast_total_facebook_likes000s = cast_total_facebook_likes/1000) %>%
    drop_na() %>%
    # Drop identifiers plus budget/gross, which mechanically determine profitM.
    select(-c(director_name, actor_1_name, actor_2_name, actor_3_name,
        plot_keywords, movie_imdb_link, country, budgetM, grossM, genres,
        language, movie_title, budget, gross))
# 75/25 train/test split (seq_len() is the safe-sequence idiom; identical
# draw to sample(1:nrow(movies), ...) under the same seed).
train_idx <- sample(seq_len(nrow(movies)), size = floor(0.75 * nrow(movies)))
movies_train <- movies %>% slice(train_idx)
movies_test <- movies %>% slice(-train_idx)
# Estimate the lasso (alpha = 1) with cross-validated lambda.
Lasso_mod <-
    cv.glmnet(profitM ~ ., data = movies_train, alpha = 1)
coef(Lasso_mod, s = Lasso_mod$lambda.min)
29 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 1704.5386926912
color 7.8685464686
color Black and White -14.0683095963
colorColor .
num_critic_for_reviews -0.0114947113
duration -0.2776782023
director_facebook_likes -0.0005768620
actor_3_facebook_likes -0.0095188709
actor_1_facebook_likes -0.0081385588
num_voted_users 0.0001681242
cast_total_facebook_likes 0.0080395751
facenumber_in_poster 0.2666746699
num_user_for_reviews 0.0075827386
content_ratingPG 10.8248014912
content_ratingPG-13 .
content_ratingR -7.3220267644
content_ratingOther 1.7707853864
title_year -0.8436939485
actor_2_facebook_likes -0.0083079361
imdb_score 3.1091588898
aspect_ratio -5.7165591209
movie_facebook_likes 0.0001270866
genre_mainAction -11.7644203113
genre_mainAdventure -2.7208186957
genre_mainComedy 6.1636778997
genre_mainCrime -6.8162401714
genre_mainDrama .
genre_mainOther 5.3857834335
cast_total_facebook_likes000s 0.0141678626
29 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 987.3257625732
color .
color Black and White .
colorColor .
num_critic_for_reviews .
duration -0.0650488589
director_facebook_likes .
actor_3_facebook_likes 0.0010865538
actor_1_facebook_likes .
num_voted_users 0.0001630243
cast_total_facebook_likes .
facenumber_in_poster .
num_user_for_reviews .
content_ratingPG 5.7811040203
content_ratingPG-13 .
content_ratingR -4.3764504437
content_ratingOther .
title_year -0.4879797143
actor_2_facebook_likes .
imdb_score .
aspect_ratio -2.7584906529
movie_facebook_likes .
genre_mainAction -5.3686769625
genre_mainAdventure .
genre_mainComedy 3.9983665305
genre_mainCrime .
genre_mainDrama .
genre_mainOther .
cast_total_facebook_likes000s .
# put in a matrix
# Compare lasso coefficients at the two standard cross-validation lambdas:
#   lambda.min -- lambda minimizing cross-validated error
#   lambda.1se -- largest lambda within one SE of the minimum (sparser model)
# rownames() already returns a character vector, so it is passed to
# data.frame() directly instead of being wrapped in its own data.frame()
# (the original nested a one-column data frame inside data.frame(), which
# only produced a usable column because of the subsequent rename()).
coef_mat <- data.frame(varname = rownames(coef(Lasso_mod)),
    Lasso_min = as.matrix(coef(Lasso_mod, s = Lasso_mod$lambda.min)) %>% round(3),
    Lasso_1se = as.matrix(coef(Lasso_mod, s = Lasso_mod$lambda.1se)) %>% round(3)) %>%
    rename(varname = 1, Lasso_min = 2, Lasso_1se = 3) %>% remove_rownames()
coef_mat
  varname Lasso_min Lasso_1se
1 (Intercept) 1704.539 987.326
2 color 7.869 0.000
3 color Black and White -14.068 0.000
4 colorColor 0.000 0.000
5 num_critic_for_reviews -0.011 0.000
6 duration -0.278 -0.065
7 director_facebook_likes -0.001 0.000
8 actor_3_facebook_likes -0.010 0.001
9 actor_1_facebook_likes -0.008 0.000
10 num_voted_users 0.000 0.000
11 cast_total_facebook_likes 0.008 0.000
12 facenumber_in_poster 0.267 0.000
13 num_user_for_reviews 0.008 0.000
14 content_ratingPG 10.825 5.781
15 content_ratingPG-13 0.000 0.000
16 content_ratingR -7.322 -4.376
17 content_ratingOther 1.771 0.000
18 title_year -0.844 -0.488
19 actor_2_facebook_likes -0.008 0.000
20 imdb_score 3.109 0.000
21 aspect_ratio -5.717 -2.758
22 movie_facebook_likes 0.000 0.000
23 genre_mainAction -11.764 -5.369
24 genre_mainAdventure -2.721 0.000
25 genre_mainComedy 6.164 3.998
[ reached 'max' / getOption("max.print") -- omitted 4 rows ]
[1] 0.00 0.25 0.50 0.75 1.00
Call:
cva.glmnet.formula(formula = profitM ~ ., data = movies_train,
alpha = alpha_list)
Model fitting options:
Sparse model matrix: FALSE
Use model.frame: FALSE
Alpha values: 0 0.25 0.5 0.75 1
Number of crossvalidation folds for lambda: 10
29 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 967.491
color .
color Black and White .
colorColor .
num_critic_for_reviews .
duration -0.057
director_facebook_likes .
actor_3_facebook_likes 0.001
actor_1_facebook_likes .
num_voted_users 0.000
cast_total_facebook_likes .
facenumber_in_poster .
num_user_for_reviews .
content_ratingPG 5.704
content_ratingPG-13 .
content_ratingR -4.308
content_ratingOther .
title_year -0.478
actor_2_facebook_likes .
imdb_score .
aspect_ratio -2.778
movie_facebook_likes .
genre_mainAction -5.158
genre_mainAdventure .
genre_mainComedy 3.928
genre_mainCrime .
genre_mainDrama .
genre_mainOther .
cast_total_facebook_likes000s .
# if we want the lambda.min version
# if we want the lambda.min version
coef(enet_fit, alpha = 0.75, s = enet_fit$modlist[[4]]$lambda.min)
29 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 1704.8100887902
color 8.3117548645
color Black and White -14.1284756012
colorColor .
num_critic_for_reviews -0.0119109398
duration -0.2792141763
director_facebook_likes -0.0005735128
actor_3_facebook_likes -0.0098836808
actor_1_facebook_likes -0.0083783098
num_voted_users 0.0001680881
cast_total_facebook_likes 0.0056239902
facenumber_in_poster 0.2695339896
num_user_for_reviews 0.0076388802
content_ratingPG 10.8481088871
content_ratingPG-13 .
content_ratingR -7.3148179137
content_ratingOther 1.8032278465
title_year -0.8438615745
actor_2_facebook_likes -0.0085528071
imdb_score 3.1407490708
aspect_ratio -5.7212843610
movie_facebook_likes 0.0001285754
genre_mainAction -11.8516721784
genre_mainAdventure -2.8045684545
genre_mainComedy 6.0821880425
genre_mainCrime -6.8999275318
genre_mainDrama .
genre_mainOther 5.3565127462
cast_total_facebook_likes000s 2.6695073318
29 x 1 sparse Matrix of class "dgCMatrix"
1
(Intercept) 987.326
color .
color Black and White .
colorColor .
num_critic_for_reviews .
duration -0.065
director_facebook_likes .
actor_3_facebook_likes 0.001
actor_1_facebook_likes .
num_voted_users 0.000
cast_total_facebook_likes .
facenumber_in_poster .
num_user_for_reviews .
content_ratingPG 5.781
content_ratingPG-13 .
content_ratingR -4.376
content_ratingOther .
title_year -0.488
actor_2_facebook_likes .
imdb_score .
aspect_ratio -2.758
movie_facebook_likes .
genre_mainAction -5.369
genre_mainAdventure .
genre_mainComedy 3.998
genre_mainCrime .
genre_mainDrama .
genre_mainOther .
cast_total_facebook_likes000s .
# Collect elastic-net coefficient vectors across the alpha grid
# (ridge = 0, 0.25, 0.5, 0.75, lasso = 1) into one comparison table;
# the rename() maps the auto-generated matrix column names to readable ones.
enet_coefs <- data.frame(varname = rownames(coef(enet_fit, alpha = 0)), ridge = as.matrix(coef(enet_fit,
alpha = 0)) %>% round(3), alpha025 = as.matrix(coef(enet_fit, alpha = 0.25)) %>%
round(3), alpha05 = as.matrix(coef(enet_fit, alpha = 0.5)) %>% round(3),
alpha075 = as.matrix(coef(enet_fit, alpha = 0.75)) %>% round(3), lasso = as.matrix(coef(enet_fit,
alpha = 1)) %>% round(3)) %>% rename(varname = 1, ridge = 2, alpha025 = 3,
alpha05 = 4, alpha075 = 5, lasso = 6) %>% remove_rownames()
1 2 3 4 5
0.0013056797 0.0021125949 0.0085947405 0.0004344368 0.0017769574
6
0.0037041528
# Classify with a 0.5 probability cutoff and cross-tabulate against the truth.
# Listing "Yes" first makes default = "Yes" the positive class in the
# sensitivity/specificity statistics that caret prints below.
library("caret")
confusionMatrix(factor(ifelse(preds_DF$scores_mod1 > 0.5, "Yes", "No"), levels = c("Yes",
"No")), factor(preds_DF$default, levels = c("Yes", "No")))
Confusion Matrix and Statistics
Reference
Prediction Yes No
Yes 100 42
No 233 9625
Accuracy : 0.9725
95% CI : (0.9691, 0.9756)
No Information Rate : 0.9667
P-Value [Acc > NIR] : 0.0004973
Kappa : 0.4093
Mcnemar's Test P-Value : < 0.00000000000000022
Sensitivity : 0.3003
Specificity : 0.9957
Pos Pred Value : 0.7042
Neg Pred Value : 0.9764
Prevalence : 0.0333
Detection Rate : 0.0100
Detection Prevalence : 0.0142
Balanced Accuracy : 0.6480
'Positive' Class : Yes
# Stack the predictions of both logistic models into long format so a single
# ggplot call can draw one ROC curve per model, distinguished by color.
# NOTE(review): the labels assume scores_mod1 came from the student-only model
# and scores_mod2 from the full model -- confirm against the fitting code.
TrainDF <- data.frame(default = c(Default$default, Default$default), scores = c(preds_DF$scores_mod1,
preds_DF$scores_mod2), models = c(rep("X = Student", length(preds_DF$scores_mod1)),
rep("X = Student + Balance + Income", length(preds_DF$scores_mod2))))
library(ggplot2)
library("plotROC")
# geom_roc() (from plotROC) computes and draws the ROC curve; cutoffs.at
# labels each curve at the listed probability thresholds.
TrainROC <- ggplot(TrainDF, aes(m = scores, d = default, color = models)) +
geom_roc(show.legend = TRUE, labelsize = 3.5, cutoffs.at = c(0.99, 0.9,
0.7, 0.5, 0.3, 0.1, 0))
# style_roc() adds conventional ROC axis formatting; the theme() calls move
# the legend inside the lower-right corner of the plot panel.
TrainROC <- TrainROC + style_roc(theme = theme_grey) + theme(axis.text = element_text(colour = "blue")) +
theme(legend.justification = c(1, 0), legend.position = c(1, 0), legend.box.margin = margin(c(50,
50, 50, 50)))
plot(TrainROC)
Load library(plyr) before library(tidyverse).
# Calibration plot: recode the outcome as 0/1 and pair it with the model-2
# predicted probabilities.
scores3DF <- data.frame(default = ifelse(Default$default == "Yes", 1, 0), scores = preds_DF$scores_mod2)
# plyr::ddply groups observations into probability bins via cut() and
# colwise(mean) averages each column per bin, giving the mean prediction and
# the observed default rate per bin.
# NOTE(review): cut() is right-closed and excludes the left endpoint, so a
# score of exactly 0 falls into no bin -- confirm scores are strictly > 0.
calData <- ddply(scores3DF, .(cut(scores3DF$scores, c(0, 0.05, 0.15, 0.25, 0.35,
0.45, 0.55, 0.65, 0.75, 0.85, 0.95, 1))), colwise(mean))
# Hard-coded bin midpoints for the x-axis.
# NOTE(review): this assumes every one of the 11 bins is non-empty; an empty
# bin would misalign midpoints with rows -- verify for this dataset.
calData$midpoint <- c(0.025, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.975)
colnames(calData) <- c("preds", "true", "midpoint")
# Perfect calibration lies on the red 45-degree line.
calPlot <- ggplot(calData, aes(x = midpoint, y = true)) + geom_point() + ylim(0,
1) + geom_abline(intercept = 0, slope = 1, color = "red") + xlab("Prediction midpoint") +
ylab("Observed event percentage")
plot(calPlot)
R uses factors to handle categorical variables, variables that have a fixed and known set of possible values. Factors are also helpful for reordering character vectors to improve display. The goal of the forcats package is to provide a suite of tools that solve common problems with factors, including changing the order of levels or the values. Some examples include:
fct_reorder(): Reordering a factor by another variable.
fct_infreq(): Reordering a factor by the frequency of values.
fct_relevel(): Changing the order of a factor by hand.
fct_lump(): Collapsing the least/most frequent values of a factor into “other”.